#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# QQ: 34538980@qq.com
# Jekkay Hu, 2013.5.5
# ///////////////////////////////////////////////////////////////////
#                            _ooOoo_                               //
#                           o8888888o                              //
#                           88" . "88                              //
#                           (| ^_^ |)                              //
#                           O\  =  /O                              //
#                        ____/`---'\____                           //
#                      .'  \\|     |//  `.                         //
#                     /  \\|||  :  |||//  \                        //
#                    /  _||||| -:- |||||-  \                       //
#                    |   | \\\  -  /// |   |                       //
#                    | \_|  ''\---/''  |   |                       //
#                    \  .-\__  `-`  ___/-. /                       //
#                  ___`. .'  /--.--\  `. . ___                     //
#                ."" '<  `.___\_<|>_/___.'  >'"".                  //
#              | | :  `- \`.;`\ _ /`;.`/ - ` : | |                 //
#              \  \ `-.   \_ __\ /__ _/   .-` /  /                 //
#        ========`-.____`-.___\_____/___.-`____.-'========         //
#                             `=---='                              //
#        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^        //
#             佛祖保佑       永无BUG        运行正常                   //
#     -------------------------------------------------------      //
#               QQ: 34538980@qq.com                                //
#               博客: http://www.easysb.cn                          //
#               Jekkay Hu, 2013.5.5                                //
# ///////////////////////////////////////////////////////////////////
import json
import logging
from os import path, makedirs
import urllib
import urllib.request
import re
import hashlib
import time
import random
import string
from utility.charset import Charset
import zlib

# Monotonic counter mixed into random_md5() seeds so that repeated calls
# within the same millisecond still produce distinct digests.
__rand_count = 100
# Domain-name handling: table of known top-level-domain suffixes used by
# the is_subdomain / get_root_domain / get_domain_suffix / is_legal_domain
# helpers below.
suffixes = ['ac', 'ad', 'ae', 'aero', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', 'arpa', 'as',
            'asia', 'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'biz',
            'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by', 'bz', 'ca', 'cat', 'cc', 'cd', 'cf',
            'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'com', 'coop', 'cr', 'cu', 'cv', 'cx', 'cy',
            'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'edu', 'ee', 'eg', 'er', 'es', 'et', 'eu', 'fi',
            'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn',
            'gov', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id',
            'ie', 'il', 'im', 'in', 'info', 'int', 'io', 'iq', 'ir', 'is', 'it', 'je', 'jm', 'jo', 'jobs',
            'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li',
            'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mil', 'mk', 'ml',
            'mm', 'mn', 'mo', 'mobi', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my', 'mz', 'na',
            'name', 'nc', 'ne', 'net', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'org', 'pa',
            'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'pro', 'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro',
            'rs', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so',
            'sr', 'st', 'su', 'sv', 'sy', 'sz', 'tc', 'td', 'tel', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn',
            'to', 'tp', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg',
            'vi', 'vn', 'vu', 'wf', 'ws', 'xn', 'ye', 'yt', 'za', 'zm', 'zw']


def is_subdomain(domain):
    """Return True if *domain* carries labels in front of its registrable root.

    e.g. 'www.baidu.com' -> True, 'baidu.com' -> False.
    Bug fix: the original fell off the end and returned None when no known
    suffix was found (e.g. 'localhost'); now returns an explicit False
    (backward compatible — both are falsy).
    """
    sections = domain.split('.')
    root = []
    has_known_suffix = False
    for section in sections:
        if section in suffixes:
            root.append(section)
            has_known_suffix = True
        else:
            # A non-suffix label restarts the candidate root.
            root = [section]
    if not has_known_suffix:
        return False
    # More labels than the root has => at least one subdomain prefix.
    return (len(sections) - len(root)) >= 1


def get_root_domain(domain):
    """Reduce a hostname to its registrable root: www.baidu.com -> baidu.com.

    Returns None for empty input; returns *domain* unchanged when no
    known suffix is present.
    """
    if not domain:
        return None
    collected = []
    saw_suffix = False
    for label in domain.split('.'):
        if label in suffixes:
            collected.append(label)
            saw_suffix = True
        else:
            # Non-suffix label restarts the candidate root.
            collected = [label]
    return '.'.join(collected) if saw_suffix else domain


def get_domain_suffix(domain):
    """Return the trailing run of known suffix labels ('www.a.com.cn' -> 'com.cn'),
    or None when the domain ends in no known suffix."""
    tail = []
    matched = False
    for label in domain.split('.'):
        if label in suffixes:
            tail.append(label)
            matched = True
        else:
            # Any non-suffix label discards what was collected so far.
            tail = []
    if not matched:
        return None
    return '.'.join(tail)


def is_legal_domain(domain):
    """Heuristically reject malformed/junk domain names.

    True only when the domain is >= 4 bytes, uses only letters/digits/dots/
    hyphens, has at least two labels, no empty or >=20-char label, and ends
    in a known TLD from `suffixes`.

    Fix: the original mixed 0 / None / bool returns and declared a needless
    `global suffixes` (reading a module global needs no declaration); now it
    always returns a bool (backward compatible — 0 and None were falsy too).
    """
    try:
        # Real domains are at least 4 bytes ('a.cn').
        if len(domain) < 4:
            return False
        # Only letters, digits, dots and hyphens are allowed.
        for ch in domain:
            if ch == '.' or ch == '-' or ch.isdigit() or ch.isalpha():
                continue
            return False
        parts = domain.split('.')
        # A domain needs at least two labels.
        if len(parts) <= 1:
            return False
        for part in parts:
            # Empty or overly long labels mark junk domains.
            if not part or len(part) >= 20:
                return False
        # The final label must be a known TLD.
        return parts[-1] in suffixes
    except Exception as e:
        logging.error('is_legal_domain exception %s' % str(e))
        return False


def extract_host_port(url):
    """Return the host[:port] part of *url*, scheme and path stripped."""
    remainder = str(url).replace(' ', '')
    scheme_sep = remainder.find('://')
    if scheme_sep >= 0:
        remainder = remainder[scheme_sep + 3:]
    slash = remainder.find('/')
    return remainder[:slash] if slash >= 0 else remainder


def extract_host(url):
    """Return only the hostname of *url* — no scheme, port or path."""
    cleaned = str(url).replace(' ', '')
    scheme_sep = cleaned.find('://')
    if scheme_sep >= 0:
        cleaned = cleaned[scheme_sep + 3:]
    colon = cleaned.find(':')
    if colon > 0:
        cleaned = cleaned[:colon]
    slash = cleaned.find('/')
    if slash >= 0:
        cleaned = cleaned[:slash]
    return cleaned


def extract_schema(url):
    """Return the URL scheme ('http', 'ftp', ...), defaulting to 'http'."""
    cleaned = str(url).replace(' ', '')
    sep = cleaned.find('://')
    if sep > 0:
        return cleaned[:sep]
    return 'http'


def extract_url_suffix(url):
    """Return the file extension of a URL's last path segment, or None."""
    # Drop the query string first so '?x=1.2' never looks like an extension.
    query = url.find('?')
    if query > 0:
        url = url[:query]
    # Keep only the last path segment.
    last_slash = url.rfind('/')
    if last_slash > 0:
        url = url[last_slash:]
    dot = url.rfind('.')
    if dot < 0:
        return None
    return url[dot + 1:]


# Extract the text between <title> and </title>.
def extract_title(content):
    """Return the page title with whitespace removed, capped at 200 chars.

    Returns '' when no <title> tag exists, None when the extracted title is
    empty, None on unexpected errors (logged).

    Fixes over the original:
    - tag search is now case-insensitive (the original only matched the
      exact strings '<title>' and '<TITLE>', missing e.g. '<Title>');
    - a missing closing tag used to slice content[s:-1], silently dropping
      the final character; now the remainder of the document is used;
    - the closing-tag search started at s+1, so an empty '<title></title>'
      skipped its own closing tag and returned garbage ('</title').
    """
    try:
        if not content:
            return ''
        content = str(content)
        # Case-insensitive search on a lowered copy; slice the original.
        start = content.lower().find('<title>')
        if start < 0:
            return ''
        start += len('<title>')
        end = content.find('</', start)
        if end < 0:
            end = len(content)
        title = content[start:end]
        # Strip spaces and line breaks, keep at most 200 characters.
        return title.replace(' ', '').replace('\r', '').replace('\n', '')[:200] if title else None
    except Exception as e:
        logging.warning('extract_title exception %s' % str(e))


def format_url(url):
    """Normalize *url*: prepend 'http://' when no scheme, ensure a root path."""
    try:
        if url.find('://') < 0:
            url = "http://" + url
        # 'http://host' contains exactly two slashes; give it a root path.
        if url.count('/') == 2:
            url += '/'
        return url
    except Exception as e:
        logging.error('[format_url] Exception %s' % e)


def get_python_exe_path():
    """Return the absolute path of the running Python interpreter."""
    from sys import executable
    return executable


def get_python_home():
    """Return the directory holding the Python executable, or None if unknown."""
    exe = get_python_exe_path()
    if not exe:
        return None
    return path.split(path.abspath(exe))[0]


def get_python_script_home():
    """Return the interpreter's 'Scripts' directory, or None if the
    interpreter location cannot be determined.

    Fix: get_python_home() may return None, which the original passed
    straight into path.join() and raised TypeError.
    """
    home = get_python_home()
    if not home:
        return None
    return path.join(home, "Scripts")


def set_log(file_path, level):
    """Configure root logging to append to *file_path* at *level*,
    creating the log directory if needed. No-op for empty paths."""
    if not file_path:
        return
    folder = path.split(file_path)[0]
    if not folder:
        return
    if not path.isdir(folder):
        makedirs(folder)
    line_format = ('[%(process)d-%(threadName)s]%(asctime)s '
                   '%(module)s.%(funcName)s:%(lineno)s %(levelname)s:%(message)s')
    logging.basicConfig(filename=file_path,
                        filemode='a+',
                        format=line_format,
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=level)


def make_parent_folder(file_path):
    """Create the parent directory of *file_path* if it does not yet exist."""
    parent = path.split(file_path)[0]
    if parent and not path.isdir(parent):
        makedirs(parent)


def gzdecode(data):
    """Decompress gzip-wrapped *data*; returns None (and logs) on failure."""
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    gzip_wbits = 16 + zlib.MAX_WBITS
    try:
        return zlib.decompress(data, gzip_wbits)
    except Exception as err:
        logging.error('[gzdecode] Exception %s' % err)
        return None


def get_dict_value(dictvar, key, default=None):
    """Safe dict lookup: *default* for non-dicts, missing or unhashable keys."""
    if not isinstance(dictvar, dict):
        return default
    try:
        return dictvar[key]
    except Exception:
        # Covers KeyError and TypeError (unhashable key) alike.
        return default


def levenshtein(a, b):
    """Calculates the Levenshtein distance between a and b."""
    len_a, len_b = len(a), len(b)
    # Keep the shorter string in `a` so the rolling row uses O(min) space.
    if len_a > len_b:
        a, b = b, a
        len_a, len_b = len_b, len_a
    row = list(range(len_a + 1))
    for i in range(1, len_b + 1):
        prev_row, row = row, [i] + [0] * len_a
        for j in range(1, len_a + 1):
            insert_cost = prev_row[j] + 1
            delete_cost = row[j - 1] + 1
            replace_cost = prev_row[j - 1] + (a[j - 1] != b[i - 1])
            row[j] = min(insert_cost, delete_cost, replace_cost)
    return row[len_a]


def levenshtein_distance(first, second):
    """Find the Levenshtein (edit) distance between two strings.

    Bug fix: the original built its DP matrix as `[range(n) for ...]` —
    `range` objects are immutable in Python 3, so any non-trivial input
    raised `TypeError` on `distance_matrix[i][j] = ...`. The rows were also
    initialised incorrectly (column 0 must hold the row index i, not 0),
    which corrupted intermediate values even under Python 2. This version
    uses the standard two-row Wagner–Fischer recurrence.
    """
    # Keep the shorter string first so the rows stay as small as possible.
    if len(first) > len(second):
        first, second = second, first
    if len(second) == 0:
        return len(first)
    # previous[j] = distance between first[:i-1] and second[:j].
    previous = list(range(len(second) + 1))
    for i, ch_first in enumerate(first, start=1):
        current = [i] + [0] * len(second)
        for j, ch_second in enumerate(second, start=1):
            deletion = previous[j] + 1
            insertion = current[j - 1] + 1
            substitution = previous[j - 1] + (ch_first != ch_second)
            current[j] = min(insertion, deletion, substitution)
        previous = current
    return previous[len(second)]


def load_dict_from_file(filepath):
    """Load unique, non-empty, non-comment ('#') lines from a UTF-8 text file,
    preserving first-seen order. Returns None and logs on failure.

    Improvements: the original deduplicated with `list.count()` (O(n) per
    line, O(n^2) overall) and wrapped `readlines()` in a pointless
    `while True` loop; this streams the file once and dedupes with a set.
    """
    try:
        item_list = []
        seen = set()  # O(1) membership instead of list.count per line
        with open(filepath, mode='r', encoding='utf-8') as fd:
            for raw in fd:
                line = raw.strip(" \r\n")
                if not line or line.startswith('#') or line in seen:
                    continue
                seen.add(line)
                item_list.append(line)
        return item_list
    except Exception as e:
        logging.error('[load_dict_from_file]Fail to load file %s: %s' % (filepath, e))


def format_file_path_unix(file_path):
    """Convert a Windows path to unix style: '\\' -> '/' and 'C:' -> '/C'."""
    if not file_path:
        return file_path
    unix_path = str(file_path).replace("\\", "/")
    # A drive letter like 'C:' becomes a leading '/C'.
    if ":" in unix_path:
        unix_path = "/" + unix_path.replace(":", "")
    return unix_path


def get_url(url):
    """Fetch *url* with browser-like headers and return the body decoded to
    text via Charset.decode, or None on any failure.

    Fix: failures were silently swallowed by `except: pass`; they are now
    logged (still best-effort — no exception escapes).
    """
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'),
            ('Referer', '%s://%s' % (extract_schema(url), extract_host(url))),
            ('Sec-Fetch-Mode', 'no-cors'),
            ('Accept-encoding', 'gzip')
        ]
        response = opener.open(url)
        html = response.read()
        # We advertise gzip above, so inflate gzip-encoded bodies here.
        if response.headers and response.headers['Content-Encoding'] == 'gzip':
            html = gzdecode(html)
        return Charset.decode(html)
    except Exception as e:
        logging.warning('[get_url] %s failed: %s' % (url, e))


def get_file(url, tmp_file_path):
    """Download *url* into *tmp_file_path* (parent folders created).

    Returns True on success, None on failure.
    Fix: failures were silently swallowed by `except: pass`; they are now
    logged (still best-effort — no exception escapes).
    """
    try:
        make_parent_folder(tmp_file_path)
        opener = urllib.request.build_opener()
        opener.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'),
            ('Referer', '%s://%s' % (extract_schema(url), extract_host(url))),
            ('Sec-Fetch-Mode', 'no-cors'),
            ('Accept-encoding', 'gzip')
        ]
        content = opener.open(url).read()
        with open(tmp_file_path, "wb") as fd:
            fd.write(content)
        return True
    except Exception as e:
        logging.warning('[get_file] %s -> %s failed: %s' % (url, tmp_file_path, e))


def extract_all_links(content):
    return re.findall('((http|ftp)s?://[a-zA-z0-9_\.\-/\?\=]+)', content)


def random_string():
    """Return 20 distinct random alphanumeric characters.

    Uses `random` (not `secrets`) — fine for cache-busting seeds, not for
    security tokens.
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.sample(alphabet, 20))


def random_md5():
    """Return an md5 hex digest of a unique seed built from the current time,
    a module-level counter and random characters."""
    global __rand_count
    # The counter and random suffix keep seeds distinct even when two calls
    # land on the same millisecond.
    __rand_count += 1
    seed = f"{time.time() * 1000}_{__rand_count}_{random_string()}"
    return calculate_md5(seed)


def calculate_md5(content):
    """Return the hex md5 digest of a text string (UTF-8 encoded)."""
    return hashlib.md5(content.encode('utf-8')).hexdigest()


def calculate_file_md5(file_path):
    """Return the hex md5 digest of a file's full contents."""
    with open(file_path, mode='rb') as fd:
        data = fd.read()
    return hashlib.md5(data).hexdigest()


def load_json_file(file_path):
    """Read *file_path* (UTF-8) and parse it as comment-tolerant JSON via
    load_json.

    Fix: the original opened the file inline and never closed it, leaking
    the handle until garbage collection.
    """
    with open(file_path, mode='r', encoding='utf-8') as fd:
        return load_json(fd.read())


def load_json(content):
    """Parse JSON text after stripping /* ... */ and // ... comments.

    Returns {} for empty input; raises on malformed JSON.
    """
    # A trailing newline guarantees every '//' comment has a terminator.
    if not content.endswith('\n'):
        content += "\n"
    for opener, closer in (('/*', '*/'), ('//', '\n')):
        content = __strip_json_comment(content, opener, closer)
    return json.loads(content) if content else {}


def __strip_json_comment(content, comment_start, comment_end):
    """Remove every comment_start..comment_end span from *content*.

    Returns the stripped text, or None when a comment_start has no matching
    comment_end (an error is logged).

    NOTE(review): this scanner is not aware of JSON string literals, so a
    string value containing the delimiters (e.g. "http://example.com" with
    comment_start '//') would have its tail stripped as a comment — confirm
    inputs never embed the delimiters inside strings.
    """
    pure_content = []  # collected non-comment fragments
    s = -1             # search cursor / start index of the current comment
    e = -1             # end index of the current comment
    pre = 0            # start of the next not-yet-copied fragment
    while s < len(content):
        # First iteration searches from the beginning (s == -1); later
        # iterations resume at the character just past the last comment.
        s = content.find(comment_start, s) if s >= 0 else content.find(comment_start)
        if s < 0:
            # No more comments: keep the remaining tail and stop.
            pure_content.append(content[pre:])
            break
        e = content.find(comment_end, s)
        if e < 0:
            logging.error("invalid json format, no */ found after %s" % s)
            return
        if s > pre:
            pure_content.append(content[pre:s])
        # Skip past the comment terminator and continue scanning there.
        s = e + len(comment_end)
        pre = s
    return ''.join(pure_content)
